library(tidyverse)
library(viridis)
library(ggridges)
library(patchwork)
library(readxl)
library(leaflet)
library(plotly)
library(scales)
library(ggplot2)

DATA CLEANING (HUGO)

years_1 <- c(1900:2012, 2014)
years_2 <- c(2015:2019)
importing_data = function(x){
 
  if(str_detect(x, str_c(years_1, collapse = "|"))) {
  read_csv(x, na = c("NULL", "", "0"), col_types = "cicccciiiicc") 
  } 
  
  else if(str_detect(x, str_c(years_2, collapse = "|"))){
    read_csv(x, na = c("NULL", "", "0"), col_types = "cccicccccccccccccccccciiiiccc")
  }
}
boston_df <- 
  tibble(list.files("data", full.names = TRUE)) %>% 
  setNames("file_name") %>% 
  mutate(data = map(file_name, importing_data)) %>% 
  unnest(data) %>% 
  mutate(year = readr::parse_number(file_name),
         city = coalesce(city, residence),
         display_name = str_replace_all(display_name, "[^a-zA-Z0-9]", " ")) %>% 
  mutate(country_residence = replace(country_residence, country_residence == "AHO", "Netherland Antilles")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ALB", "Albania")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ALG", "Algeria")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "AND", "Andorra")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ARG", "Argentina")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Argenti", "Argentina")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "AUS", "Australia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Austral", "Australia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "AUT", "Austria")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BAH", "Bahamas")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BAR", "Barbados")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Barbado", "Barbados")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BDI", "Burundi")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BLR", "Belarus")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BEL", "Belgium")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BER", "Bermuda")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BRA", "Brazil")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BRN", "Brunei")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "BUL", "Bulgaria")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "CAN", "Canada")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "CAY", "Cayman")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "CHI", "Chile")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "CHN", "China")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "COL", "Colombia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Colombi", "Colombia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "CRC", "Costa Rica")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Costa R", "Costa Rica")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "CRO", "Croatia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "CYP", "Cyprus")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "CZE", "Czech Republic")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Czech R", "Czech Republic")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "DEN", "Denmark")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "DOM", "Dominican Republic")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Dominic", "Dominican Republic")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ECU", "Ecuador")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "EGY", "Egypt")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "El Salv", "El Salvador")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ESA", "El Salvador")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ESP", "Spain")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "EST", "Estonia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ETH", "Ethiopia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Ethiopi", "Ethiopia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Faroe I", "Faroe Islands")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "FIN", "Finland")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "FLK", "Falkland Islands")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "FRA", "France")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "GBR", "England")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "GER", "Germany")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "GRE", "Greece")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "GRN", "Greenland")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "GUA", "Guatemala")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Guatema", "Guatemala")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "HKG", "Hong Kong")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Hong Ko", "Hong Kong")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "HON", "Honduras")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Hondura", "Honduras")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "HUN", "Hungary")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "INA", "Indonesia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Indones", "Indonesia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "IND", "India")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "IRL", "Ireland")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ISL", "Iceland")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ISR", "Israel")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ITA", "Italy")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "JAM", "Jamaica")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "JPN", "Japan")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "JOR", "Jordan")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "KEN", "Kenya")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "KOR", "Korea")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Korea,", "Korea")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "KSA", "Saudi Arabia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "KUW", "Kuwait")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "LAT", "Latvia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "LIE", "Liechtenstein")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Liechte", "Liechtenstein")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Lithuan", "Lithuania")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "LTU", "Lithuania")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "LUX", "Luxembourg")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Luxembo", "Luxembourg")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Macao S", "Macao")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Macedon", "Macedonia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Malaysi", "Malaysia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "MAR", "Martinique")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Martini", "Martinique")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "MAS", "Malaysia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "MEX", "Mexico")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "MGL", "Mongolia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "MLT", "Malta")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "NCA", "Nicaragua")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "NED", "Netherlands")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Netherl", "Netherlands")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "New Zea", "New Zealand")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "NGR", "Nigeria")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "NOR", "Norway")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "NZL", "New Zealand")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "OMA", "Oman")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "PAK", "Pakistan")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Palesti", "Palestine")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "PAN", "Panama")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "PAR", "Paraguay")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Paragua", "Paraguay")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "PER", "Peru")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "PHI", "Philippines")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Philipp", "Philippines")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "POL", "Poland")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "POR", "Portugal")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Portuga", "Portugal")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Puerto", "Puerto Rico")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "QAT", "Qatar")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ROU", "Romania")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Saudi A", "Saudi Arabia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "SIN", "Singapore")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Singapo", "Singapore")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "SLO", "Slovenia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Slovaki", "Slovakia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Sloveni", "Slovenia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "SMR", "San Marino")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "South A", "South Africa")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "SRB", "Serbia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Sri Lan", "Sri Lanka")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "SUI", "Switzerland")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "SVK", "Slovakia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "SWE", "Sweden")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Switzer", "Switzerland")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "TCA", "Turks and Caicos")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "THA", "Thailand")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Thailan", "Thailand")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "TPE", "Taipei")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "TRI", "Trinidad")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Trinida", "Trinidad")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "TUR", "Turkey")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "TWN", "Taiwan")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "UAE", "United Arab Emirates")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "UGA", "Uganda")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "UKR", "Ukraine")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "United", "United States")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "URU", "Uruguay")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "USA", "United States")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "VEN", "Venezuela")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Venezue", "Venezuela")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "VGB", "Virgin Islands")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "VIE", "Vietnam")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "ZIM", "Zimbabwe")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "RSA", "South Africa")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "RUS", "Russia")) %>%
  mutate(country_residence = replace(country_residence, country_residence == "Russian", "Russia")) %>%
  filter(!is.na(display_name)) %>% 
  select(-file_name, -residence, -first_name, -last_name)

SUHANI

Geographical Analysis of Winners

Overview


We were interested in creating a map with the locations where winners from the Boston marathon over the past 120 year are from. Additionally, we wanted to examine how the locations of winners changed over time. We also analyzed data from winners from the wheel chair division and compared results to the men and women open divsion.

Data Cleaning


boston_df2 = boston_df %>% 
  filter(year > 1999) %>% 
  filter(overall == 1) %>% 
  filter(gender == "M") %>% 
  select(year, city, state, overall, everything()) %>% 
  drop_na(city) %>% 
  separate(city, into = c("city", "state", "country"), sep = ",") %>% 
  select(-country, -state) %>% 
  writexl::write_xlsx("interactive_map_men.xlsx")
boston_df3 = boston_df  %>% 
  filter(year > 1999) %>% 
  filter(gender_result == 1) %>% 
  filter(gender == "F") %>% 
  select(year, city, state, overall, everything()) %>% 
  drop_na(city) %>% 
  writexl::write_xlsx("interactive_map_women.xlsx")
map_df = read_excel("data/latitude_longitude_winners.xlsx", sheet = 1) %>% 
  select(year, city, latitude, longitude, age, gender, official_time, display_name) %>% 
  rename(place = city) 
map_df2 = read_excel("data/latitude_longitude_winners.xlsx", sheet = 2) %>% 
  select(year, city, latitude, longitude, age, gender, official_time, display_name) %>% 
  rename(place = city) 
men_open = read_excel("data/geo_winners.xlsx", sheet = 4) %>% 
  janitor::clean_names() %>% 
  rename(place = country) %>% 
  separate(official_time, into = c("data", "official_time"), sep = " (?=[^ ]+$)") %>% 
  select(-data) 
lat_long = read_excel("data/lat_long.xlsx") %>% 
  select(-country) %>% 
  rename(place = name)
men_merge <- merge(men_open,lat_long,by="place") %>% 
  rename(display_name = name) %>% 
  mutate(gender = "M") 
men_merge2 = men_merge %>% 
  mutate(age = NA) %>% 
  filter(year > 1900) %>% 
  filter(year < 2000) 
men_total = rbind(men_merge2, map_df)
women_open = read_excel("data/geo_winners.xlsx", sheet = 3) %>% 
  janitor::clean_names() %>% 
  rename(place = country) %>% 
  separate(official_time, into = c("data", "official_time"), sep = " (?=[^ ]+$)") %>% 
  select(-data) 
women_merge = merge(women_open, lat_long, by="place") %>% 
  rename(display_name = name) %>% 
  mutate(gender = "F") 
  
women_merge2 = women_merge %>% 
  mutate(age = NA) %>% 
  filter(year > 1900) %>% 
  filter(year < 2000) 
woman_total = rbind(women_merge2, map_df2) %>% 
  drop_na(latitude)
women_wheelchair = read_excel("data/geo_winners.xlsx", sheet = "women_wheel_chair") %>% 
    janitor::clean_names() %>% 
    separate(official_time, into = c("data", "official_time"), sep = " (?=[^ ]+$)") %>% 
    select(-data) %>% 
    mutate(gender = "F")
men_wheelchair = read_excel("data/geo_winners.xlsx", sheet = "men_wheel_chair") %>% 
    janitor::clean_names() %>% 
    separate(official_time, into = c("data", "official_time"), sep = " (?=[^ ]+$)") %>% 
    select(-data) %>% 
    mutate(gender = "M")
wheelchair_merge = rbind(women_wheelchair, men_wheelchair) %>% 
  rename(place = country)
wheelchair_total = merge(wheelchair_merge, lat_long, by="place") %>% 
  filter(!(year == 2013))
winners_bind = rbind(women_merge, men_merge)
  • We cleaned data by filtering for the winners from the men and women’s open division.
  • Some state data was wrongly included into the variable for city and was reorganized.
  • Because we were interested in created a map, needed to create latitude and longitude data for the state’s and city’s winners were from. To do this, exported and loaded our data into Excel and auto-populated the latitude and longitude using Excel’s “Geography” feature which can conver geography type data into latitude and longitude. We used the following instruction to covert the data: https://www.mrexcel.com/excel-tips/find-latitude-and-longitude-for-each-city-in-excel/.
  • Data on the city and/or state a participant was from was only available for the year 2000 and beyond. After graphing this data, we were displeased with the lack of data over time as we only had datapoints for the year 2000 and beyond. Thus, found an additional dataset that would provide information on the locations winners pre-2000 were from using this website: https://www.baa.org/races/boston-marathon/results/champions.
  • While our geography data included the countries that each winner was from, our origional dataset included more precise locations, the city and the state a participant was from as well as the participants age. Because of this, we used the data available from our origional dataset (the years 2000 and beyond) and combined the missing years for our new data set (pre-2000) to create a map of locations winners were from.
  • When importing our geography data, the official times of runners was imported as a date and a time. We separate this variable and deleted the incorrect protion of this variable.
  • Because our geograpy data only inlcuded countries were winners were from, we could not use Excel’s “Geography” feature. Instead, we found a dataset with the latitude and longitude of common countries throughout the world and merged this dataset by country to add latitude and longitude data.
  • To combine our origional data and our geography data, we used rbind to vertically merge the datasets. Prior to binding the datasets, we had to ensure the variables in both datasets were named the exact same. Because our origional dataset included age as a variable and we wanted to include it, where possible, in our map, we created a blank age variable in our geography dataset prior to binding.
  • In the fully binded dataset, we dropped any missing latitide or longitude values.
  • Using the geophraphy data, we cleaned the wheel chair data to a process similar to above. We additionally excluded the year 2013 due to the bombing.

Plots


Map of Male Winners

map_winners = leaflet(men_total) %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addMarkers(lat = ~latitude, lng = ~longitude,
  popup = paste("Name:", men_total$display_name, "<br>", "Year:", men_total$year,"<br>",  "Official Time:", men_total$official_time, "<br>", "Age:", men_total$age, "<br>", "Gender:", men_total$gender), clusterOptions = markerClusterOptions())
map_winners

The continent with the largest nuber of male winners from 1900 and beyond is North American, specifically the US, followed by Africa. There are 44 winners from the United States, 20 from Kenya, 10 from Japan.


Map of Female Winners

map_winners_women = leaflet(woman_total) %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addMarkers(lat = ~latitude, lng = ~longitude,
  popup = paste("Name:", woman_total$display_name, "<br>", "Year:", woman_total$year,"<br>",  "Official Time:", woman_total$official_time, "<br>", "Age:", woman_total$age, "<br>", "Gender:", woman_total$gender), clusterOptions = markerClusterOptions())
map_winners_women

This is in contrast to female winners from 1900 and beyond where Africa is the continent with the most winners, closely followed by Europe, and North America. Most of the female winners in Africa are from Kenya. The map for female winners suggests that a participants proximinity to Boston likely does not play a role in their probability of winning the marathon.


Plot of Officials Times of Winners by Year in Open Division

plot2 = winners_bind %>%
  mutate(text_label = 
           str_c("Name: ", display_name, "\nGender: ", gender)) %>% 
  plot_ly(
    x = ~year, y = ~official_time, text = ~text_label, color = ~place,
    type = "scatter") %>% 
  layout(
    title = "Official Times of Male and Female Winners each Year by Winner's Country",
    xaxis = list(title = 'Year'),
    yaxis = list(title = 'Marathon Time'), 
    legend = list(title=list(text='Residence')))
plot2

Over time, the winning time for both male and female winners has declined. While prior to 1950, most of the winners were from the United States, most of the winners are from Kenya. Additionally, we can see that the lowest marathon times have been from people from Kenya.


Map of Winners in Wheel Chair Division

map_wheelchair = leaflet(wheelchair_total) %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addMarkers(lat = ~latitude, lng = ~longitude,
  popup = paste("Name:", wheelchair_total$name, "<br>", "Year:", wheelchair_total$year,"<br>", "Official Time:", wheelchair_total$official_time, "<br>", "Gender:", wheelchair_total$gender), clusterOptions = markerClusterOptions())
map_wheelchair

When assessing wheel chair division map, we can see that 25 winners are from the United States, followed by 19 from Europe, and 10 from Africa. While the male and female open division lack winners from Switzerland, there are 15 winners in the wheel chair division from Switzerland.


Plot of Official Times of Winners by Year in Wheel Chair Division

plot3 = wheelchair_total %>%
  mutate(text_label = 
           str_c("Name: ", name, "\nGender: ", gender)) %>% 
  plot_ly(
    x = ~year, y = ~official_time, text = ~text_label, color = ~place,
    type = "scatter") %>% 
  layout(
    title = 'Official Times of Winners each Year in Wheel Chair Division',
    xaxis = list(title = 'Year'),
    yaxis = list(title = 'Marathon Time'), 
    legend = list(title=list(text='Residence')))
plot3

While there is a downward trend in official times for the open division, there is no trend in official times over time. Many of the winners pre-2000 are from the United States while many of the winners post-2000 are from South Africa. The lowest time recorded is from an individual in Switzerland.


Weather Analysis

Data Cleaning


weather = read_excel("data/weather_conditions.xlsx") %>% 
  janitor::clean_names() %>% 
  filter(!(year == 2013)) %>% 
  separate(wind, into = c("wind_direction", "wind_speed"), sep = "\\s")  %>% 
  separate(wind_speed, into = c("wind_min", "wind_max"), sep = "-") %>% 
  mutate(wind_min = as.numeric(wind_min)) %>% 
  mutate(wind_max = as.numeric(wind_max)) %>% 
  mutate(wind_speed_average = (wind_min + wind_max)/2)
weather2 = weather %>% 
  select(boston_temp, year, wind_speed_average, sky) %>% 
  filter(!(year == 2021)) 
winners_bind2 = winners_bind  %>% 
   filter(year > 1999) 
boston_df2 = boston_df %>% 
     filter(year > 1999) 
  
merge_weather2 <- merge(weather2,winners_bind2,by="year")
merge_weather_participants = merge(weather2,boston_df2,by="year") %>% 
  mutate(boston_temp = as.numeric(boston_temp)) %>% 
  mutate(seconds = as.numeric(seconds)) 
  • We cleaned data by cleaning up variable names using janitor::clean_names. We also filter out for 2013, the year of the bombing.
  • We sparated the wind variable into direction, min, and max and created an additional variable that takes the average of the wind min and max speed.
  • We merged this dataset with a dataset with official times to see the relationship between champion winning time and weather.

Plots


Map of Weather during Boston Marathon

plot4 = weather %>%
  mutate(text_label = 
           str_c("Wind Speed Average: ", wind_speed_average)) %>%
  plot_ly(x = ~year, y = ~boston_temp, text = ~text_label, color= ~ sky, size = ~wind_speed_average,
          type = "scatter", mode = "markers",  colors = "viridis", 
          sizes = c(50, 700), marker = list(opacity = 0.7)) %>%
  layout(
    title = 'Weather over Time',
    xaxis = list(title = 'Year'),
    yaxis = list(title = 'Boston Temperature'), 
    legend = list(title=list(text='Sky')))
plot4

*Note datapoint size is related to wind speed.

Over the past 20 years, the temperature in Boston has generally stayed between 40-70 degrees and is usually a clear day. In 2012, there was a particularly hot marathon day with the tempature in the high 80’s. Over the past few years, there has been relativly low wind speeds on average with exception to 2015.


Wind Analysis during Boston Marathon

ggplot(data = weather)+
  geom_segment(aes(x = year, xend = year, y = wind_min, yend = wind_max, colour = wind_direction), size = 5, alpha = 0.6) + 
  labs(
    title = "Wind Speed During Boston's Marathon Over Time",
    x = "Year",
    y = "Wind Speed Range (mph)",
    color='Wind Direction')

In the early 2000’s, the wind direction was generally toward the North/north east and recently has been generally toward the west, northwest direction. There has been relatively low wind speed with little wind speed varability the past few years during the Boston marathon. Around 2010, there are relatively higher wind speeds. In 2007, there was a particularly windy marathon with speeds ranging from 20-30 mph.


Comparison between Chamption Times and Weather

plot5 = merge_weather2 %>%
  mutate(text_label = 
           str_c("Name: ", display_name, "\nYear: ", year, "\nResidence: place")) %>% 
 plot_ly(
    x = ~boston_temp, y = ~official_time,  text = ~text_label, color = ~sky, size = ~wind_speed_average,
          type = "scatter", mode = "markers",  colors = "viridis", 
          sizes = c(50, 700), marker = list(opacity = 0.7)) %>%
  layout(
    title = 'Champion Times and Weather',
    xaxis = list(title = "Boston's Temperature"),
    yaxis = list(title = 'Official Time'), 
    legend = list(title=list(text='Sky')))
plot5

*Note datapoint size is related to wind speed.

The fastest marathon times in the past 20 years have occured when the temperature is around 55 degrees for men and around 60 degrees for woman. At temperatures 55 and below as well as 85 and above, there tends to be higher wind speeds. Although there are only a few datapoints above 70 degrees, one may hypothesize from the graph that as temperature increases, the fastest marathon time also increases.


Comparison between Participants Times and Weather

plot6 = merge_weather_participants %>% 
   plot_ly(x = ~boston_temp, y = ~seconds, color = ~sky, type = "box") %>% 
  layout(
    title = 'Participants Times by Temperature',
    xaxis = list(title = "Boston's Temperature"),
    yaxis = list(title = 'Official Time (seconds)'), 
    legend = list(title=list(text='Sky')))
plot6

As temperature increases, the median marathon time among participants also increases also slightly increases. Furthermore, the lowested median marathon times occur around 55 degrees. We also see that there are many outliers in the higher timepoints.

TAMARA

CLAIRE

Prepare age data

boston_df_age_plot =
  boston_df %>% 
  select(age) %>%
  drop_na(age)

Age distribution overall (counts)

This is plotly:

age_plotly_original = 
boston_df_age_plot %>% 
  count(age) %>% 
 plot_ly(
      x = ~age, y = ~n, color = ~age,
      type = "bar", colors = "viridis") %>% 
  layout(
    xaxis = list(title = "Age"),
    yaxis = list(title = "Number of Participants")
)

This is plain ggplot:

age_ggplot = 
ggplot(boston_df_age_plot, aes(x = age)) +
  geom_bar(fill = "cornflowerblue", 
                 color = "white") +
  labs(title = "Participants by age", 
       subtitle = "binwidth = 5 years",
       x = "Age", y = "Number of participants")

This is plotly from ggplot:

ggplotly(age_ggplot)

Histogram of percentages by age

age_percentage_ggplot = 
  ggplot(boston_df_age_plot, 
       aes(x = age, 
           y = ..count.. / sum(..count..))) +
  geom_histogram(fill = "cornflowerblue", 
                 color = "white", 
                 binwidth = 5) + 
  labs(title = "Participants by age", 
       y = "Percent",
       x = "Age") +
  scale_y_continuous(labels = percent)

plotly:

ggplotly(age_percentage_ggplot)

Density plot ages

density_plot = 
boston_df_age_plot %>%
  ggplot(aes(x = age)) +
    geom_density(fill = "cornflowerblue", color = "black", alpha = 0.8) +
    ggtitle("Distribution of age")
ggplotly(density_plot)

DATA CLEAN GENDER CODE

? not sure if we want to put this at top data clean maybe need to decide how much it affects other plots?

boston_df_gender =
    boston_df %>%
    select(gender) %>%
    mutate(gender = na_if(gender, "U"), 
            gender = recode(gender, m = "M"),
           gender = factor(gender, levels = c("M", "F"), labels = c("male", "female"))) %>% 
  drop_na(gender) %>% 
  count(gender)

Participants by gender (%’s)

bar_graph_gender = boston_df_gender %>%
  mutate(pct = n / sum(n),
         pctlabel = paste0(round(pct*100), "%"))
# plot the bars as percentages, 
# in decending order with bar labels
ggplot(bar_graph_gender, 
       aes(x = reorder(gender, -pct),
           y = pct)) + 
  geom_bar(stat = "identity", 
           fill = "indianred3", 
           color = "black") +
  geom_text(aes(label = pctlabel), 
            vjust = -0.25) +
  scale_y_continuous(labels = percent) +
  labs(x = "Gender", 
       y = "Percent", 
       title  = "Participants by gender")

Participants by gender (counts)

bar_graph_frequency = 
ggplot(bar_graph_gender, 
       aes(x = gender, 
           y = n)) + 
  geom_bar(stat = "identity", 
           fill = "indianred3",
           color = "black") +
  geom_text(aes(label = n), 
            vjust = -0.5) +
 scale_y_continuous(labels = scales::comma) +
  labs(x = "Gender", 
       y = "Frequency", 
       title  = "Participants by gender")

SYDNEY

© 2021 GitHub, Inc. Terms Privacy